{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "7a1d71ed-d4e6-4d42-abd2-37d557e8542d",
   "metadata": {},
   "source": [
    "# Supervised Anomaly Detection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d86871aa-cfe5-46bb-a396-b46f1e827d18",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: pandas in /opt/conda/lib/python3.11/site-packages (2.1.1)\n",
      "Requirement already satisfied: numpy in /opt/conda/lib/python3.11/site-packages (1.24.4)\n",
      "Requirement already satisfied: matplotlib in /opt/conda/lib/python3.11/site-packages (3.8.0)\n",
      "Requirement already satisfied: scikit-learn in /opt/conda/lib/python3.11/site-packages (1.3.1)\n",
      "Requirement already satisfied: catboost in /opt/conda/lib/python3.11/site-packages (1.2.5)\n",
      "\u001b[31mERROR: Could not find a version that satisfies the requirement catboost-widget (from versions: none)\u001b[0m\u001b[31m\n",
      "\u001b[0m\u001b[31mERROR: No matching distribution found for catboost-widget\u001b[0m\u001b[31m\n",
      "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "# catboost-widget is left out: pip reports no matching distribution for it,\n",
    "# which makes the whole install command fail.\n",
    "%pip install pandas numpy matplotlib scikit-learn catboost"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3dfcea12-5f63-4aee-83f9-fefdda199733",
   "metadata": {},
   "source": [
    "# Load Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "c18c29b9-2266-48c6-894d-05410cc8ccd0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "# Read ai4i2020.csv (resolved relative to the working directory) into a DataFrame.\n",
    "df = pd.read_csv(filepath_or_buffer='ai4i2020.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "67497ec5-96a9-4766-aafa-a75797804882",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['UDI', 'Product ID', 'Type', 'Air temperature [K]',\n",
       "       'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]',\n",
       "       'Tool wear [min]', 'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF',\n",
       "       'RNF'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the column labels of the loaded frame.\n",
    "column_labels = df.columns\n",
    "column_labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b2f3727f-4856-4b18-ad7c-a6dd52f6e1f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns handed to the model as categorical features.\n",
    "# NOTE(review): 'Product ID' looks like a per-item identifier, and the saved model\n",
    "# report lists it at 0% importance for every target - consider dropping it.\n",
    "cat_features = ['Product ID', 'Type']\n",
    "\n",
    "# Model inputs: the categorical columns first, then the numeric sensor readings.\n",
    "X_features = [\n",
    "    *cat_features,\n",
    "    'Air temperature [K]',\n",
    "    'Process temperature [K]',\n",
    "    'Rotational speed [rpm]',\n",
    "    'Torque [Nm]',\n",
    "    'Tool wear [min]',\n",
    "]\n",
    "\n",
    "# Target columns; one binary classifier is trained per entry.\n",
    "y_features = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "fc13b36a-60d3-4d1c-804e-dc713d479d81",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the frame column-wise into model inputs and target columns.\n",
    "X = df.loc[:, X_features]\n",
    "y = df.loc[:, y_features]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a886b173-a930-4d8c-ae70-5496caa32df9",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X,\n",
    "    y,\n",
    "    test_size=0.2,\n",
    "    random_state=0,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "656f7d41-7ce5-4632-a874-69ebfa061ab5",
   "metadata": {},
   "source": [
    "# EDA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "3d2f7224-3a80-49ca-a95a-4ec0f6daea36",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Machine failure\n",
       "0    9661\n",
       "1     339\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Class balance of the overall 'Machine failure' flag.\n",
    "failure_counts = df['Machine failure'].value_counts()\n",
    "failure_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "8d4eafe5-243d-4b7a-b163-5743369630a1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "TWF  HDF  PWF  OSF  RNF\n",
       "0    0    0    0    0      9652\n",
       "     1    0    0    0       106\n",
       "     0    1    0    0        80\n",
       "          0    1    0        78\n",
       "1    0    0    0    0        42\n",
       "0    0    0    0    1        18\n",
       "          1    1    0        11\n",
       "     1    0    1    0         6\n",
       "          1    0    0         3\n",
       "1    0    0    1    0         2\n",
       "               0    1         1\n",
       "          1    1    0         1\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count how often each combination of the target flags occurs.\n",
    "df.value_counts(subset=y_features)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b5ec4a44-d463-4601-b98f-071dbf9c9b20",
   "metadata": {},
   "source": [
    "# Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "c5ee4585-05d0-475f-9b78-95ebe5341825",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Report for: TWF\n",
      "0.986\n",
      "0.5452261306532663\n",
      "[[1971   19]\n",
      " [   9    1]]\n",
      "Tool wear [min] is 42% important on TWF\n",
      "Torque [Nm] is 16% important on TWF\n",
      "Air temperature [K] is 14% important on TWF\n",
      "Rotational speed [rpm] is 11% important on TWF\n",
      "Process temperature [K] is 8% important on TWF\n",
      "Type is 6% important on TWF\n",
      "Product ID is 0% important on TWF\n",
      "Report for: HDF\n",
      "0.995\n",
      "0.9974721941354904\n",
      "[[1968   10]\n",
      " [   0   22]]\n",
      "Rotational speed [rpm] is 37% important on HDF\n",
      "Air temperature [K] is 33% important on HDF\n",
      "Process temperature [K] is 16% important on HDF\n",
      "Torque [Nm] is 6% important on HDF\n",
      "Tool wear [min] is 2% important on HDF\n",
      "Type is 2% important on HDF\n",
      "Product ID is 0% important on HDF\n",
      "Report for: PWF\n",
      "0.994\n",
      "0.941921740105393\n",
      "[[1972   10]\n",
      " [   2   16]]\n",
      "Torque [Nm] is 47% important on PWF\n",
      "Rotational speed [rpm] is 34% important on PWF\n",
      "Type is 7% important on PWF\n",
      "Tool wear [min] is 5% important on PWF\n",
      "Air temperature [K] is 3% important on PWF\n",
      "Process temperature [K] is 2% important on PWF\n",
      "Product ID is 0% important on PWF\n",
      "Report for: OSF\n",
      "0.996\n",
      "0.997976732422863\n",
      "[[1969    8]\n",
      " [   0   23]]\n",
      "Tool wear [min] is 36% important on OSF\n",
      "Torque [Nm] is 33% important on OSF\n",
      "Type is 14% important on OSF\n",
      "Rotational speed [rpm] is 8% important on OSF\n",
      "Process temperature [K] is 3% important on OSF\n",
      "Air temperature [K] is 2% important on OSF\n",
      "Product ID is 0% important on OSF\n",
      "Report for: RNF\n",
      "0.9965\n",
      "0.499248496993988\n",
      "[[1993    3]\n",
      " [   4    0]]\n",
      "Air temperature [K] is 21% important on RNF\n",
      "Tool wear [min] is 19% important on RNF\n",
      "Torque [Nm] is 18% important on RNF\n",
      "Rotational speed [rpm] is 17% important on RNF\n",
      "Process temperature [K] is 14% important on RNF\n",
      "Type is 8% important on RNF\n",
      "Product ID is 0% important on RNF\n"
     ]
    }
   ],
   "source": [
    "from catboost import CatBoostClassifier\n",
    "from sklearn import metrics\n",
    "import numpy as np\n",
    "def train_and_test(on, X_train, y_train, X_val, y_val, cat_features, iterations=800):\n",
    "    \"\"\"Fit a CatBoost classifier for one target column and print a validation report.\n",
    "\n",
    "    Prints, in order: the target name, accuracy, balanced accuracy, the confusion\n",
    "    matrix, and every feature ranked by descending importance.\n",
    "\n",
    "    Args:\n",
    "        on: Name of the target column selected from ``y_train`` and ``y_val``.\n",
    "        X_train: Training features; its column labels name the importances.\n",
    "        y_train: Training targets, indexable by ``on``.\n",
    "        X_val: Validation features passed to ``predict``.\n",
    "        y_val: Validation targets, indexable by ``on``.\n",
    "        cat_features: Categorical feature identifiers forwarded to CatBoost.\n",
    "        iterations: Number of boosting iterations (default 800).\n",
    "\n",
    "    Returns:\n",
    "        The fitted ``CatBoostClassifier``, so callers can reuse the model\n",
    "        instead of retraining it.\n",
    "    \"\"\"\n",
    "    target_train = y_train[on].values.flatten()\n",
    "    target_val = y_val[on].values.flatten()\n",
    "\n",
    "    clf = CatBoostClassifier(\n",
    "        iterations=iterations,\n",
    "        random_seed=42,\n",
    "        loss_function='Logloss',\n",
    "        cat_features=cat_features,\n",
    "        # Let CatBoost derive class weights itself to counter class imbalance.\n",
    "        auto_class_weights='Balanced',\n",
    "    )\n",
    "    clf.fit(X_train, target_train, verbose=False)\n",
    "\n",
    "    y_pred = clf.predict(X_val)\n",
    "    print(\"Report for:\", on)\n",
    "    print(metrics.accuracy_score(target_val, y_pred))\n",
    "    print(metrics.balanced_accuracy_score(target_val, y_pred))\n",
    "    print(metrics.confusion_matrix(target_val, y_pred))\n",
    "\n",
    "    feature_importance = clf.feature_importances_\n",
    "    # Negate so argsort yields the most important feature first.\n",
    "    sorted_idx = np.argsort(-feature_importance)\n",
    "    for col, imp in zip(X_train.columns[sorted_idx], feature_importance[sorted_idx]):\n",
    "        # One decimal place instead of int() truncation, so small non-zero\n",
    "        # importances are no longer reported as 0%.\n",
    "        print(f\"{col} is {imp:.1f}% important on {on}\")\n",
    "\n",
    "    return clf\n",
    "\n",
    "# Train and report one classifier per target column.\n",
    "for target in y_features:\n",
    "    train_and_test(\n",
    "        target,\n",
    "        X_train,\n",
    "        y_train,\n",
    "        X_val=X_test,\n",
    "        y_val=y_test,\n",
    "        cat_features=cat_features,\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "484f3c7c-106f-4519-8b65-fd73fa33eb72",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
